# Install libraries
#!pip install pandas numpy matplotlib seaborn
#!pip install plotly
#!pip install nbformat
# Import libraries
import pandas as pd
pd.set_option('display.max_rows', 50)
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline
import matplotlib
import datetime as dt
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings("ignore")
import plotly
import plotly.graph_objects as go
import plotly.express as px
import plotly.offline as py
plotly.offline.init_notebook_mode(connected=True)
# Read dataset
df = pd.read_csv("ecommerce_dataset.csv", encoding="latin1")
# Parse the invoice timestamp once, then derive the calendar columns from it.
df["InvoiceDate"] = pd.to_datetime(df["InvoiceDate"])
invoice_dt = df["InvoiceDate"].dt
df["Date"] = invoice_dt.date
df["Month"] = invoice_dt.month
df["Year"] = invoice_dt.year
df.head()
# Because it is a transactions dataset, there are unnecessary extra rows:
# service/administrative StockCodes (postage, fees, manual entries, ...) that
# start with one of the prefixes below instead of a normal product code.
regex = "^(gift|DCGS|POST|D|C2|DOT|M|BANK CHARGES|S|AMAZONFEE|PADS|B|CRUK)"
series = df['StockCode']
# na=False: a missing StockCode is treated as "no match" instead of producing
# a NaN in the mask, which would make the boolean indexing below fail.
df2 = df[~series.str.contains(regex, na=False)]
df2['Quantity'].describe()
# Assumption: a negative Quantity marks a return, so those rows (and rows with
# a non-positive price) are removed from the dataset in one combined mask.
keep = (df2['Quantity'] > 0) & (df2['UnitPrice'] > 0)
df3 = df2[keep]
df3.shape
# Inspect the unit-price distribution and the rows behind the extreme values.
fig = px.box(df3, y="UnitPrice")
fig.show()
df3[df3['UnitPrice'] > 649]
# United Kingdom represents almost the entire dataset, so restrict to it.
df3['Country'].value_counts(normalize=True)[:10]
df4 = df3.loc[df3['Country'] == "United Kingdom"]
df4.shape
# Percentage of missing values per column.
df4.isna().mean().round(4) * 100
# Drop every row that still contains a missing value.
df5 = df4.dropna(axis=0)
df5.shape
# How often each StockCode appears among the remaining transactions.
freq_Stock = (
    df5.groupby('StockCode')
    .size()
    .to_frame('count')
    .reset_index()
    .sort_values('count', ascending=False)
)
freq_Stock
# Remove stock codes with fewer than 10 examples.
remove_Stock = freq_Stock.loc[freq_Stock['count'] < 10, 'StockCode']
df6 = df5[~df5['StockCode'].isin(remove_Stock)]
df6.shape
# Values extremely different from others.
# NOTE(review): removed the `%%timeit` cell magic that used to be here. It is
# not valid Python outside IPython, and inside a notebook it re-runs the cell
# in a timing loop and discards its variables, so `df7` assigned below would
# never be defined for the following cells.
def remove_outilers(df):
    '''Drop per-group price outliers.

    For every (Country, StockCode) group, keep only the rows whose UnitPrice
    lies in the half-open interval (mean - std, mean + std], where std is the
    population standard deviation (np.std, ddof=0) of the group's prices.

    Input: dataframe with 'Country', 'StockCode' and 'UnitPrice' columns
    Output: dataframe with outlier rows removed and a fresh RangeIndex
    '''
    parts = []
    for key, subdf in df.groupby(['Country', 'StockCode']):
        m = np.mean(subdf.UnitPrice)
        st = np.std(subdf.UnitPrice)
        # The interval is deliberately asymmetric (strict > on the low side,
        # <= on the high side), preserving the original behaviour.
        parts.append(subdf[(subdf.UnitPrice > (m - st)) & (subdf.UnitPrice <= (m + st))])
    # Concatenate once at the end instead of growing a DataFrame inside the
    # loop: repeated pd.concat is quadratic in the number of groups.
    return pd.concat(parts, ignore_index=True) if parts else pd.DataFrame()
# Apply the per-group outlier filter defined above.
df7 = remove_outilers(df6)
df7.shape
# Re-check the price distribution after outlier removal and spot-check rows
# at both price extremes.
fig = px.box(df7, y="UnitPrice")
fig.show()
df7.loc[df7['UnitPrice'] == 165].head(2)
df7.loc[df7['UnitPrice'] == 0.04].head(2)
# Top 10 (Description, UnitPrice) combinations by total revenue.
# 'sum' instead of np.sum: passing raw NumPy callables to GroupBy.agg is
# deprecated in recent pandas.
df8 = df7.groupby(['Description', 'UnitPrice'])['Quantity'].agg('sum').reset_index()
df8['Total'] = df8['Quantity'] * df8['UnitPrice']
df8 = df8.sort_values('Total', ascending=False).head(10)
fig = px.bar(df8, x='Description', y='Total')
fig.show()
# Top 10 customers by (total quantity x mean unit price).
# String aggregators ('sum'/'mean') instead of np.sum/np.mean: raw NumPy
# callables in GroupBy.agg are deprecated in recent pandas.
df9 = df7.groupby(['CustomerID']).agg({'Quantity': 'sum', 'UnitPrice': 'mean'}).reset_index()
df9['Total'] = df9['Quantity'] * df9['UnitPrice']
df9 = df9.sort_values('Total', ascending=False).head(10)
# Build a readable categorical label ("Client 12345") for the x axis; int()
# strips the float representation the id carries after aggregation.
df9['CustomerID'] = df9['CustomerID'].apply(lambda x: "Client " + str(int(x)))
fig = px.bar(df9, x='CustomerID', y='Total')
fig.show()
# Monthly revenue, comparing 2010 against 2011.
# String aggregators instead of np.sum/np.mean (deprecated in recent pandas).
df10 = df7.groupby(['Month', 'Year']).agg({'Quantity': 'sum', 'UnitPrice': 'mean'}).reset_index()
df10['Total'] = df10['Quantity'] * df10['UnitPrice']
df10 = df10.sort_values(['Year', 'Month'], ascending=True)
# The data starts in December 2010: after sorting, row 0 is Dec/2010 and the
# remaining rows are the 2011 months. Dec/2010 is aligned against Dec/2011 by
# left-padding the 2010 series with eleven zeros.
meses = df10[1:].Month
last_value = df10.loc[df10['Year'] == 2010, 'Total'].to_list()
ini_value = [0] * 11
fig = go.Figure(data=[
    go.Bar(name='2010', x=meses, y=ini_value + last_value),
    go.Bar(name='2011', x=meses, y=df10[1:].Total.to_list()),
])
# Change the bar mode
fig.update_layout(barmode='group')
fig.show()
def get_month(x): return dt.datetime(x.year, x.month, 1)
df11 = df7.copy()
# NOTE(review): read from df11 (the copy just made) rather than df7; the
# original mixed the two frames, which only worked because they are identical.
df11['InvoiceMonth'] = df11['InvoiceDate'].apply(get_month)
# A customer's cohort is the month of their first purchase.
grouping = df11.groupby('CustomerID')['InvoiceMonth']
df11['CohortMonth'] = grouping.transform('min')
df11.head()
def get_date_int(df, column):
    """Split the datetime column `column` of `df` into (year, month, day) Series."""
    accessor = df[column].dt
    return accessor.year, accessor.month, accessor.day
# CohortIndex = number of months between the invoice month and the customer's
# cohort month, 1-based (1 == the cohort month itself).
invoice_year, invoice_month, _ = get_date_int(df11, 'InvoiceMonth')
cohort_year, cohort_month, _ = get_date_int(df11, 'CohortMonth')
df11['CohortIndex'] = (invoice_year - cohort_year) * 12 + (invoice_month - cohort_month) + 1
df11.head()
# Number of distinct active customers for each (cohort, month-since-start).
grouping = df11.groupby(['CohortMonth', 'CohortIndex'])
# .nunique() is the built-in (and faster) equivalent of
# apply(pd.Series.nunique).
cohort_data = grouping['CustomerID'].nunique()
cohort_data = cohort_data.reset_index()
cohort_counts = cohort_data.pivot(index='CohortMonth',
                                  columns='CohortIndex',
                                  values='CustomerID')
# String index keeps the heatmap's y-axis labels readable.
cohort_counts.index = cohort_counts.index.astype('str')
# compare over time as well as across cohorts: divide every row by the
# cohort's initial size (its first month) to get a retention rate.
cohort_sizes = cohort_counts.iloc[:, 0]
retention = cohort_counts.divide(cohort_sizes, axis=0)
plt.figure(figsize=(10, 8))
plt.title('Retention Rate')
sns.heatmap(data=retention, annot=True, fmt='.0%', vmin=0.0, vmax=0.5, cmap='BuGn')
# Drop the first (December 2010) cohort so only the last 12 months remain.
# .copy() makes df12 an independent frame, so the TotalSum assignment below
# writes to it directly instead of to a view (SettingWithCopyWarning).
df12 = df11[df11['CohortMonth'] != "2010-12-01"].copy()
# Snapshot one day after the last invoice, so Recency is always >= 1.
snapshot_date = max(df12.InvoiceDate) + timedelta(days=1)
snapshot_date
df12['TotalSum'] = df12['Quantity'] * df12['UnitPrice']
# Raw RFM metrics per customer: days since last purchase, number of invoice
# lines, and total money spent.
datamart = df12.groupby(['CustomerID']).agg({
    'InvoiceDate': lambda x: (snapshot_date - x.max()).days,
    'InvoiceNo': 'count',
    'TotalSum': 'sum'})
datamart.rename(columns={'InvoiceDate': 'Recency',
                         'InvoiceNo': 'Frequency',
                         'TotalSum': 'MonetaryValue'}, inplace=True)
datamart.head()
# Quartile scores. Recency is best when SMALL (label 4 = most recent);
# Frequency and MonetaryValue are best when LARGE (label 4 = highest).
r_labels = range(4, 0, -1)
f_labels = range(1, 5)
m_labels = range(1, 5)
r_quartiles = pd.qcut(datamart['Recency'], 4, labels=r_labels)
f_quartiles = pd.qcut(datamart['Frequency'], 4, labels=f_labels)
m_quartiles = pd.qcut(datamart['MonetaryValue'], 4, labels=m_labels)
datamart = datamart.assign(R=r_quartiles.values,
                           F=f_quartiles.values,
                           M=m_quartiles.values)
def join_rfm(x): return str(x['R']) + str(x['F']) + str(x['M'])
# Combined segment string plus the summed numeric score, then a summary of the
# raw metrics per score.
datamart['RFM_Segment'] = datamart.apply(join_rfm, axis=1)
datamart['RFM_Score'] = datamart[['R', 'F', 'M']].sum(axis=1)
datamart.head()
datamart.groupby('RFM_Score').agg({
    'Recency': 'mean',
    'Frequency': 'mean',
    'MonetaryValue': ['mean', 'count'],
}).round(1)
def segment_me(df):
    """Map a customer's RFM_Score to a named tier.

    >= 9 -> 'Gold'; 5-8 -> 'Silver'; everything below 5 -> 'Bronze'.
    """
    score = df['RFM_Score']
    if score >= 9:
        return 'Gold'
    # The branch above already guarantees score < 9 here, so the original
    # explicit upper-bound check was redundant.
    elif score >= 5:
        return 'Silver'
    else:
        return 'Bronze'
# Attach the coarse tier and summarise the raw RFM metrics per tier.
datamart['General_Segment'] = datamart.apply(segment_me, axis=1)
tier_summary = datamart.groupby('General_Segment').agg({
    'Recency': 'mean',
    'Frequency': 'mean',
    'MonetaryValue': ['mean', 'count'],
})
tier_summary.round(1)